Encoding Features

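This notebook explores different ways of encoding features. It runs the nbminer pipeline with a k-means encoder at several cluster counts and compares the resulting reconstruction error, template coverage, and cluster sizes.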


In [1]:
# Necessary imports 
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.features import Features
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
from nbminer.encoders.ast_graph.ast_graph import *
# Root directory that is scanned for per-person sub-directories; named once
# so the listing and the path joins below cannot drift apart.
testbed_dir = '../testbed/Final'

# os.listdir returns entries in arbitrary order, so both listings are sorted
# to give a reproducible notebook order across runs and machines.
people = sorted(os.listdir(testbed_dir))
notebooks = []
for person in people:
    person_dir = os.path.join(testbed_dir, person)
    # Skip stray files sitting next to the per-person directories.
    if os.path.isdir(person_dir):
        notebooks.extend(
            os.path.join(person_dir, filename)
            for filename in sorted(os.listdir(person_dir))
            if filename.endswith('.ipynb')
        )
# Build one NotebookMiner per discovered .ipynb path.
notebook_objs = [NotebookMiner(path) for path in notebooks]

In [2]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.results.reconstruction_error.astor_error import AstorError
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
# Input to the pipeline: a Features object built from every loaded notebook.
a = Features(notebook_objs)

# Instantiate the stages with their default arguments, in the same order in
# which they are handed to the Pipeline below.
gastf, rbn, gi = GetASTFeatures(), ResampleByNode(), GetImports()
fe, ke, ae = FeatureEncoding(), KmeansEncoder(), AstorError()

pipe = Pipeline([gastf, rbn, gi, fe, ke, ae])
a = pipe.transform(a)

# The AstorError stage keeps its own results; print its summary.
print(ae.get_summary())


<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x111a6ebe0>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x111a6e048>
<nbminer.preprocess.get_imports.GetImports object at 0x1a1cc7b710>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x1a1cc7b8d0>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1a1cc104a8>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x1a1cc100f0>
The average length of the original strings is: 60.32411226234785
The average length of the reconstructed strings is: 48.856453073131476
The average edit distance is: 65.38658082687859
The average number of characters in common is: 17.369427035261776


In [3]:
# One accumulator per metric; entry i belongs to the i-th cluster count
# visited by the loop below.
avg_dist_general, avg_sim_general = [], []
coverage_general, number_templates_general = [], []
labels = []

for value in [1200, 500, 100, 10, 1]:
    print('Calculating for value: ', value)

    # Rebuild the input and every stage from scratch for each cluster count.
    a = Features(notebook_objs)
    gastf, rbn, gi = GetASTFeatures(), ResampleByNode(), GetImports()
    fe = FeatureEncoding()
    ke = KmeansEncoder(n_clusters=value)
    ae = AstorError()

    pipe = Pipeline([gastf, rbn, gi, fe, ke, ae])
    a = pipe.transform(a)

    # Record the metrics reported by the AstorError stage, then the labels
    # reported by the k-means encoder for this run.
    avg_dist_general.append(ae.average_distance())
    avg_sim_general.append(ae.average_similarity())
    coverage_general.append(ae.get_percent_coverage())
    number_templates_general.append(ae.get_unique_templates())
    labels.append(ke.get_labels())


Calculating for value:  1200
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x1a21751ba8>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1a22157630>
<nbminer.preprocess.get_imports.GetImports object at 0x1a22157550>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x1a22157fd0>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1a213850b8>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x1a21385b38>
Calculating for value:  500
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x1a21f9efd0>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1a2241e208>
<nbminer.preprocess.get_imports.GetImports object at 0x1a2245a0f0>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x1a2245a550>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1a2245ad68>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x1a22485e10>
Calculating for value:  100
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x1a20ad9748>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1a21efe128>
<nbminer.preprocess.get_imports.GetImports object at 0x1a21efe1d0>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x1a21efe4a8>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1a21efe2b0>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x1a2266e2b0>
Calculating for value:  10
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x1a1e2d5fd0>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1a22337080>
<nbminer.preprocess.get_imports.GetImports object at 0x1a223376d8>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x1a223278d0>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1a22331358>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x1a223317b8>
Calculating for value:  1
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x1a1e50edd8>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1a20214ac8>
<nbminer.preprocess.get_imports.GetImports object at 0x1a20214da0>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x1a20214978>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1a202140f0>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x1a20214dd8>

In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (20, 20)
# Cluster counts that were passed to KmeansEncoder(n_clusters=...) when the
# metric lists were filled; must list the same values in the same order.
x = [1200, 500, 100, 10, 1]

fig, axes = plt.subplots(2, 2)

# One row per panel: (axes, y-values, line label, title and y-axis text).
panels = [
    (axes[0, 0], avg_dist_general,
     'Average Distance (All calls are the same)', 'Average edit distance'),
    (axes[0, 1], avg_sim_general,
     'Average Similarity (All calls are the same)', 'Average matching characters'),
    (axes[1, 0], coverage_general,
     'Coverage (All calls are the same)', 'Coverage of templates'),
    (axes[1, 1], number_templates_general,
     'Number of Templates (All calls are the same)', 'Number of templates'),
]

# Draw every panel with the same recipe instead of four copy-pasted stanzas.
# Ending the cell on a loop also keeps the Text repr out of the output.
for ax, values, label, caption in panels:
    ax.plot(x, values, label=label)
    ax.set_title(caption)
    # x holds the requested cluster counts, so label the axis accordingly.
    ax.set_xlabel('Number of clusters')
    ax.set_ylabel(caption)


Out[5]:
<matplotlib.text.Text at 0x1a23aa12e8>

Cluster evaluation


In [35]:
def get_vec_sizes(v):
    """Return how many times each distinct element occurs in ``v``.

    Only the counts are returned, not the elements. They are listed in the
    order in which each distinct element first appears (dict insertion order).
    """
    counts = {}
    for item in v:
        counts[item] = counts.get(item, 0) + 1
    return list(counts.values())
def num_one(v):
    """Return the number of elements of ``v`` that compare equal to 1."""
    return sum(1 for item in v if item == 1)

In [56]:
import numpy as np
plt.rcParams['figure.figsize'] = (10, 5)

# Cluster sizes for every run, computed once and shared by both curves.
cluster_sizes = [get_vec_sizes(v) for v in labels]

# The median curve skips the last run (x[-1]); presumably the single-cluster
# run would dwarf the other values - TODO confirm.
n1, = plt.plot(x[:-1], [np.median(sizes) for sizes in cluster_sizes[:-1]],
               label='Median size of cluster')
n2, = plt.plot(x, [num_one(sizes) for sizes in cluster_sizes],
               label='Number of clusters with one element')
plt.legend(handles=[n1, n2])
plt.xlabel('Number of clusters')
# The trailing semicolon keeps the Text repr out of the cell output.
plt.ylabel('Number of examples');


Out[56]:
<matplotlib.text.Text at 0x1a3ee22eb8>

In [71]:
plt.rcParams['figure.figsize'] = (10, 30)
fig, axes = plt.subplots(len(labels))
# plt.subplots returns a bare Axes rather than an array when only one subplot
# is requested; normalise so the indexing below also works for a single run.
axes = np.atleast_1d(axes)

# One histogram of cluster sizes per run, stacked vertically.
for i, run_labels in enumerate(labels):
    axes[i].hist(get_vec_sizes(run_labels), bins=100)
    axes[i].set_xlabel('Number of data points')
    axes[i].set_ylabel('Number of clusters')
    # Use singular wording for the one-cluster run.
    axes[i].set_title("1 Cluster" if x[i] == 1 else str(x[i]) + " Clusters")



In [ ]: